library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(DT)
# Distance metric used throughout the script ('Manhattan' or 'Euclidean').
DIST_METHOD = 'Manhattan'
# Matching error-metric label, used for plot legends.
errMethod = if (DIST_METHOD == 'Manhattan') 'MAE' else 'RMSE'
# Weight used for weighted KNN: inverse of the squared distance.
# (Plain inverse distance would be: function(x) 1 / x)
weightFormula = function(x) {
  1 / (x ^ 2)
}
# MAC addresses of the six access points retained for the analysis.
keepMacs = c(
  #'00:0f:a3:39:e1:c0', #default
  '00:0f:a3:39:dd:cd', #added
  '00:14:bf:b1:97:8a',
  '00:14:bf:3b:c7:c6',
  '00:14:bf:b1:97:90',
  '00:14:bf:b1:97:8d',
  '00:14:bf:b1:97:81'
)
# Raw trace files: "offline" = training survey, "online" = test data.
offline_file = "../../data/offline.final.trace.txt"
online_file = "../../data/online.final.trace.txt"
# Create a function to parse the data
# Parse one raw scan line from the trace file.
#
# Each line looks like:
#   t=<time>;id=<mac>;pos=<x>,<y>,<z>;degree=<angle>;<mac>=<signal>,<freq>,<type>;...
# Splitting on ';', '=' and ',' yields 10 header tokens followed by groups
# of 4 tokens (MAC address, signal, channel, device type) per access point
# (book pages 6 and 9).
#
# Returns a character matrix with one row per access-point reading and 10
# columns (6 header fields replicated + the 4 reading fields), or NULL when
# the line carries no signal readings.
processLine = function(x){
  tokens = strsplit(x, "[;=,]")[[1]]
  # Exactly 10 tokens means only the handheld-device header is present,
  # i.e. no access point was scanned on this line.
  if (length(tokens) == 10) {
    return(NULL)
  }
  # Remaining tokens come in groups of 4: MAC, signal, channel, type.
  readings = matrix(data = tokens[-(1:10)], ncol = 4, byrow = TRUE)
  # Replicate the header fields (time, scanMac, posX, posY, posZ,
  # orientation) across every reading row and attach the readings.
  header = matrix(tokens[c(2, 4, 6:8, 10)], nrow(readings), 6, byrow = TRUE)
  cbind(header, readings)
}
#' @description Read a raw trace file, parse every scan line and return a
#' cleaned data frame: comment lines dropped, measurement columns converted
#' to numeric, adhoc devices (type 1) removed, readings restricted to the
#' requested access points, uninformative columns discarded, and a rounded
#' angle plus an XY location key added.
#' @param file Filename to be read in
#' @param keepMacs a list of MAC addresses to keep (NULL keeps nothing,
#'   since `%in% NULL` is FALSE everywhere)
#' @returns A dataframe
readData = function(file, keepMacs = NULL){
  raw = readLines(file)
  # Drop '#' comment lines, then parse each remaining line into a matrix
  # of individual access-point readings and stack them all.
  dataLines = raw[substr(raw, 1, 1) != "#"]
  parsed = lapply(dataLines, processLine)
  df = as.data.frame(do.call("rbind", parsed), stringsAsFactors = FALSE)
  names(df) = c(
    "time", "scanMac", "posX", "posY", "posZ",
    "orientation", "mac", "signal",
    "channel", "type"
  )
  # Everything was parsed as character; convert the measurement columns.
  numVars = c("time", "posX", "posY", "posZ", "orientation", "signal")
  df[numVars] = lapply(df[numVars], as.numeric)
  # Device type 1 is adhoc; keep only access-point readings.
  df = df[df$type != 1, ]
  # Restrict to the requested access points.
  df = df[df$mac %in% keepMacs, ]
  # scanMac is constant and posZ carries no information for this analysis.
  df = df[ , !(names(df) %in% c("scanMac", "posZ"))]
  # Snap raw orientations to the nearest 45-degree heading.
  df$angle = roundOrientation(df$orientation)
  # Unique location key used for grouping and cross validation.
  df$posXY = paste(df$posX, df$posY, sep = "-")
  return(df)
}
# Number of access points retained; used throughout for signal-column
# arithmetic (e.g. the 6:(6+numMacs-1) slices below).
numMacs = length(keepMacs)
numMacs
## [1] 6
# Snap raw orientation angles to the nearest 45-degree reference heading.
# Candidate references run 0, 45, ..., 360; values that snap to 360 are
# reported as 0, so only the eight distinct headings remain.
roundOrientation = function(angles) {
  refs = seq(0, 360, by = 45)            # nine candidates, 0 through 360
  headings = c(refs[1:8], 0)             # fold 360 back onto 0
  nearest = sapply(angles, function(a) which.min(abs(a - refs)))
  headings[nearest]
}
# Parse and clean both traces: "offline" is the training site survey,
# "online" is the held-out test data.
offline = readData(file = offline_file, keepMacs = keepMacs)
dim(offline)
## [1] 769089 10
length(unique(offline$posXY))
## [1] 166
online = readData(file = online_file, keepMacs = keepMacs)
dim(online)
## [1] 34815 10
length(unique(online$posXY))
## [1] 60
# Reshape tall signal data into wide format: one row per location (posXY)
# with one averaged-signal column per MAC address. This is equivalent to
# the tall2wide function (a tall-to-wide pivot).
#   data        tall data frame with posXY, mac and signal columns
#   varSignal   name of the signal column to average
#   keepVars    identifier columns copied from the group's first row
#   sampleAngle if TRUE, keep only the rows at one randomly sampled
#               reference heading per location (mimics online data in CV;
#               uses the RNG, so results depend on the seed)
# NOTE(review): depends on the global `numMacs` and assumes every location
# has readings for all numMacs access points — tapply's result order
# determines the output column order.
reshapeSS = function(data, varSignal = "signal", keepVars = c("posXY", "posX", "posY"), sampleAngle = FALSE) {
# The eight reference headings an angle can have been snapped to.
refs = seq(0, by = 45, length = 8)
byLocation =
with(
data,
by(
data,
list(posXY),
function(x) {
# Optionally restrict this location to a single random heading.
if (sampleAngle) x = x[x$angle == sample(refs, size = 1), ]
# Identifier columns come from the group's first observation.
ans = x[1, keepVars]
# Mean signal per access point at this location.
avgSS = tapply(x[ , varSignal ], x$mac, mean)
y = matrix(avgSS, nrow = 1, ncol = numMacs,
dimnames = list(ans$posXY, names(avgSS)))
cbind(ans, y)
}
)
)
# Stack the per-location rows: one row per location.
newDataSS = do.call("rbind", byLocation)
return(newDataSS)
}
# Wide-format summary of the online (test) data: one row per location,
# keeping orientation/angle alongside the averaged signals.
keepVars = c("posXY", "posX","posY", "orientation", "angle")
onlineSummary = reshapeSS(data = online, varSignal = "signal", keepVars = keepVars)
head(onlineSummary,10)
## posXY posX posY orientation angle 00:0f:a3:39:dd:cd
## 0-0.05 0-0.05 0.00 0.05 130.5 135 -63.20721
## 0.15-9.42 0.15-9.42 0.15 9.42 112.3 90 -66.11712
## 0.31-11.09 0.31-11.09 0.31 11.09 230.1 225 -67.05405
## 0.47-8.2 0.47-8.2 0.47 8.20 5.8 0 -74.15315
## 0.78-10.94 0.78-10.94 0.78 10.94 348.3 0 -71.40367
## 0.93-11.69 0.93-11.69 0.93 11.69 158.3 180 -69.99074
## 1.08-12.19 1.08-12.19 1.08 12.19 229.1 225 -73.43750
## 1.24-3.93 1.24-3.93 1.24 3.93 261.5 270 -71.11009
## 1.39-6.61 1.39-6.61 1.39 6.61 114.1 135 -59.75676
## 1.52-9.32 1.52-9.32 1.52 9.32 7.0 0 -65.38318
## 00:14:bf:3b:c7:c6 00:14:bf:b1:97:81 00:14:bf:b1:97:8a
## 0-0.05 -62.94898 -61.81395 -40.06897
## 0.15-9.42 -73.96190 -72.70103 -47.81308
## 0.31-11.09 -70.08247 -70.09890 -54.08824
## 0.47-8.2 -64.25806 -72.59770 -45.65289
## 0.78-10.94 -66.96000 -66.80952 -48.41379
## 0.93-11.69 -70.44340 -70.58025 -43.66346
## 1.08-12.19 -69.20192 -67.92553 -52.00820
## 1.24-3.93 -69.62745 -59.76136 -38.91753
## 1.39-6.61 -62.23913 -64.56627 -48.92381
## 1.52-9.32 -63.35922 -67.48913 -50.04167
## 00:14:bf:b1:97:8d 00:14:bf:b1:97:90
## 0-0.05 -63.04301 -55.23333
## 0.15-9.42 -69.45455 -46.88000
## 0.31-11.09 -69.13158 -53.88660
## 0.47-8.2 -60.79747 -49.58000
## 0.78-10.94 -65.00000 -54.84694
## 0.93-11.69 -65.59302 -47.27083
## 1.08-12.19 -71.58696 -51.66667
## 1.24-3.93 -71.66667 -53.23333
## 1.39-6.61 -60.79798 -50.49057
## 1.52-9.32 -65.10345 -49.38542
#' @description Selects the appropriate observations (based on test data orientation) from the original tall data
#' and reformats it such that it can be used for training the KNN algorithm
#' @param angleNewObs Angle (Orientation) of the test observation
#' @param train_data Original tall-skinny data
#' @param m Keep the 'm' closest orientations to angleNewObs
#' @returns A dataframe suitable for training (wide format, one row per location)
selectTrain = function(angleNewObs, train_data, m){
# Find the angles to keep
nearestAngle = roundOrientation(angles = angleNewObs)
if (m %% 2 == 1) {
# Odd m: take (m-1)/2 reference headings on each side of the nearest one.
angles = seq(-45 * (m - 1) /2, 45 * (m - 1) /2, length = m)
} else {
# Even m: build m+1 symmetric offsets, then drop the offset farthest from
# the observed angle; which side is dropped depends on whether angleNewObs
# lies above or below its nearest reference heading.
m = m + 1
angles = seq(-45 * (m - 1) /2, 45 * (m - 1) /2, length = m)
if (sign(angleNewObs - nearestAngle) > -1)
angles = angles[ -1 ]
else
angles = angles[ -m ]
}
# Re-centre the offsets on the nearest heading and wrap into [0, 360].
angles = angles + nearestAngle
angles[angles < 0] = angles[ angles < 0 ] + 360
angles[angles > 360] = angles[ angles > 360 ] - 360
# Subset only those angles from original data (tall-skinny)
train_data_subset = train_data[train_data$angle %in% angles, ]
# Convert to Wide and average the data for the same positions
train_data_subset = reshapeSS(data = train_data_subset, varSignal = "signal")
return(train_data_subset)
}
#' @description Computes the distance of the new signal (single observation) to each observation in the training dataset
#' @param newSignal The signal values (one per access point) for a single validation observation
#' @param trainSubset The training data; signal columns occupy positions 4 to 4+numMacs-1
#' @param weighted Whether the mean value should be weighted based on distance or not
#' @param method Distance metric: 'Manhattan' or 'Euclidean' ('Euclidian' accepted for backward compatibility)
#' @return A dataframe containing same number of rows as that in the training data.
#' The observations are ordered by the distance to the new signal. Each row contains 5 columns.
#' 1st column is the XY location of the training observation (string)
#' 2nd column is the X location of the training observation (float)
#' 3rd column is the Y location of the training observation (float)
#' 4th column is the distance of the point under consideration to the training observation (float)
#' 5th column is the inverse distance or weight (float). Weight is hard coded to 1 for all observations if weighted = FALSE
findNN = function(newSignal, trainSubset, weighted = FALSE, method = DIST_METHOD) {
  # Signal differences: one column per training row, one row per MAC.
  diffs = apply(trainSubset[ , 4:(4 + numMacs - 1)], 1, function(x) x - newSignal)
  # Bug fix: the original matched only the misspelling 'Euclidian', so
  # calling with 'Euclidean' (the spelling used by calcError and
  # floorErrorMap) left `dists` undefined. Accept both, and fail loudly
  # on anything else instead of erroring later on an undefined variable.
  if (method %in% c('Euclidean', 'Euclidian')) {
    dists = apply(diffs, 2, function(x) sqrt(sum(x^2)))  # root squared error
  } else if (method == 'Manhattan') {
    dists = apply(diffs, 2, function(x) sum(abs(x)))     # absolute error
  } else {
    stop("Unknown distance method: ", method)
  }
  closest = order(dists)
  ordered_dist = dists[closest]
  # Inverse-distance weights, or uniform weights for the unweighted case.
  if (weighted) {
    weight = weightFormula(ordered_dist)
  } else {
    weight = rep(1, length(dists))
  }
  return(cbind(trainSubset[closest, 1:3], ordered_dist, weight))
}
#' @description XY Prediction for a single value of k (num neighbors)
#' @param newSignals The Signal Values for the validation data for each observation
#' @param newAngles The Orientation of the validation data for each observation
#' @param trainData The training data to be used (tall format)
#' @param numAngles Number of closest reference angles to include in the data
#' @param k Perform the prediction for num neighbors = k
#' @param weighted Whether the mean value should be weighted based on distance or not.
#' @return A matrix with num rows = number of (validation) observations and num columns = 2
#' Each row indicates the prediction of the mean X and Y values for that observation
predXY = function(newSignals, newAngles, trainData, numAngles = 1, k = 3, weighted = FALSE){
  # Idiom fix: preallocate one slot per validation observation. The
  # original `list(length = nrow(newSignals))` created a single-element
  # list named "length" rather than an n-slot list.
  closeXY = vector("list", nrow(newSignals))
  for (i in seq_len(nrow(newSignals))) {
    # Training subset restricted to orientations near this observation's angle.
    trainSS = selectTrain(newAngles[i], trainData, m = numAngles)
    closeXY[[i]] = findNN(
      newSignal = as.numeric(newSignals[i, ]),
      trainSubset = trainSS,
      weighted = weighted
    )
  }
  #' @description Returns the (un)weighted mean X and Y locations for a single observation and single value of neighbors
  #' @param x Ordered dataframe from findNN: posXY, posX, posY, distance, weight
  #' @param k Number of nearest neighbors to use
  #' @return A pair of XY mean values for k number of neighbors
  k_means_single_obs = function(x, k){
    weights = x[1:k, 5]
    weighted_x = sum(x[1:k, 2] * weights) / sum(weights)
    weighted_y = sum(x[1:k, 3] * weights) / sum(weights)
    return(c(weighted_x, weighted_y))
  }
  # One (x, y) prediction per observation, stacked into a matrix.
  estXY = lapply(closeXY, k_means_single_obs, k)
  estXY = do.call("rbind", estXY)
  return(estXY)
}
#' @description Aggregate prediction error between estimated and actual XY positions.
#' @param estXY Estimated positions: a matrix/data frame with X and Y columns,
#'   or a dimensionless numeric vector for a single observation
#' @param actualXY Actual positions (matrix or data frame with X and Y columns)
#' @param method 'Manhattan' gives mean absolute error; 'Euclidean' (legacy
#'   spelling 'Euclidian' also accepted) gives root squared error over rows
#' @return A single numeric: total row error divided by the number of rows
calcError = function(estXY, actualXY, method = DIST_METHOD){
  # Robustness fix: a dimensionless vector is a single observation. The
  # original `'numeric' %in% class(estXY)` missed integer vectors, for
  # which nrow() is NULL.
  if (is.null(dim(estXY))) rows = 1 else rows = nrow(estXY)
  # Fail loudly on an unknown method instead of erroring later on an
  # undefined `er`.
  if (method %in% c('Euclidean', 'Euclidian')) {
    er = sqrt(sum(rowSums((estXY - actualXY)^2))) / rows
  } else if (method == 'Manhattan') {
    er = sum(rowSums(abs(estXY - actualXY))) / rows
  } else {
    stop("Unknown distance method: ", method)
  }
  return(er)
}
# Grid-search setup: evaluate up to K neighbors over v cross-validation
# folds. Seed fixed for reproducible fold assignment and angle sampling.
set.seed(42)
K = 20
v = 11
allNeighbors = c(1:K)
allNeighbors
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
allAngles = c(1:8)
allAngles
## [1] 1 2 3 4 5 6 7 8
# Randomly assign locations to folds (one column per fold). Note: the 166
# locations do not divide evenly into 15 x 11 = 165 cells, so the last
# sampled location is dropped (see the warning below).
permuteLocs = sample(unique(offline$posXY))
permuteLocs = matrix(permuteLocs, ncol = v, nrow = floor(length(permuteLocs)/v))
## Warning in matrix(permuteLocs, ncol = v, nrow = floor(length(permuteLocs)/v)):
## data length [166] is not a sub-multiple or multiple of the number of rows [15]
permuteLocs
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] "11-3" "24-6" "27-8" "25-8" "7-7" "24-8" "22-8" "12-5" "0-12" "0-7"
## [2,] "9-7" "21-6" "2-9" "24-7" "5-8" "15-8" "11-8" "32-6" "2-7" "29-8"
## [3,] "14-7" "30-3" "12-7" "23-4" "2-2" "10-5" "4-7" "17-8" "0-11" "17-7"
## [4,] "13-8" "3-3" "32-8" "30-7" "2-8" "33-7" "25-3" "1-13" "12-6" "24-3"
## [5,] "33-8" "15-7" "30-8" "0-3" "20-3" "21-3" "2-11" "13-3" "8-8" "22-3"
## [6,] "21-8" "0-8" "0-1" "2-13" "1-5" "0-4" "0-0" "23-3" "1-10" "16-8"
## [7,] "20-8" "24-5" "26-3" "31-8" "31-3" "10-4" "3-8" "2-5" "8-7" "0-9"
## [8,] "27-3" "1-1" "32-3" "2-6" "13-7" "11-4" "26-6" "8-3" "26-4" "1-3"
## [9,] "9-3" "19-3" "2-1" "14-8" "33-4" "6-8" "5-7" "1-4" "13-6" "4-8"
## [10,] "1-12" "25-4" "20-7" "24-4" "26-7" "0-13" "22-4" "19-7" "33-3" "18-8"
## [11,] "10-8" "2-0" "1-0" "16-7" "7-8" "3-7" "6-7" "26-8" "2-4" "32-5"
## [12,] "21-4" "11-5" "21-7" "27-7" "32-7" "13-5" "15-3" "9-4" "7-3" "22-7"
## [13,] "16-3" "4-3" "28-8" "21-5" "1-6" "22-5" "19-8" "10-7" "1-11" "25-7"
## [14,] "26-5" "18-7" "23-7" "12-8" "29-3" "23-5" "13-4" "2-10" "2-3" "10-3"
## [15,] "1-8" "5-3" "1-9" "0-10" "12-4" "31-7" "28-3" "11-7" "23-6" "28-7"
## [,11]
## [1,] "10-6"
## [2,] "1-7"
## [3,] "23-8"
## [4,] "11-6"
## [5,] "14-3"
## [6,] "12-3"
## [7,] "29-7"
## [8,] "22-6"
## [9,] "18-3"
## [10,] "17-3"
## [11,] "9-8"
## [12,] "6-3"
## [13,] "32-4"
## [14,] "0-2"
## [15,] "1-2"
# Illustration only: fold 1's locations pulled from the *tall* offline
# data (the CV below uses the wide per-location summary instead).
onlineFold = subset(offline, posXY %in% permuteLocs[ , 1])
head(onlineFold)
## time posX posY orientation mac signal channel
## 154732 1.139653e+12 1 8 0.7 00:14:bf:b1:97:8a -47 2437000000
## 154733 1.139653e+12 1 8 0.7 00:14:bf:b1:97:8d -52 2442000000
## 154735 1.139653e+12 1 8 0.7 00:14:bf:b1:97:90 -51 2427000000
## 154736 1.139653e+12 1 8 0.7 00:14:bf:3b:c7:c6 -68 2432000000
## 154737 1.139653e+12 1 8 0.7 00:14:bf:b1:97:81 -67 2422000000
## 154738 1.139653e+12 1 8 0.7 00:0f:a3:39:dd:cd -72 2412000000
## type angle posXY
## 154732 3 0 1-8
## 154733 3 0 1-8
## 154735 3 0 1-8
## 154736 3 0 1-8
## 154737 3 0 1-8
## 154738 3 0 1-8
# For reference: the wide-format online summary, to compare with the
# tall fold above.
head(onlineSummary)
## posXY posX posY orientation angle 00:0f:a3:39:dd:cd
## 0-0.05 0-0.05 0.00 0.05 130.5 135 -63.20721
## 0.15-9.42 0.15-9.42 0.15 9.42 112.3 90 -66.11712
## 0.31-11.09 0.31-11.09 0.31 11.09 230.1 225 -67.05405
## 0.47-8.2 0.47-8.2 0.47 8.20 5.8 0 -74.15315
## 0.78-10.94 0.78-10.94 0.78 10.94 348.3 0 -71.40367
## 0.93-11.69 0.93-11.69 0.93 11.69 158.3 180 -69.99074
## 00:14:bf:3b:c7:c6 00:14:bf:b1:97:81 00:14:bf:b1:97:8a
## 0-0.05 -62.94898 -61.81395 -40.06897
## 0.15-9.42 -73.96190 -72.70103 -47.81308
## 0.31-11.09 -70.08247 -70.09890 -54.08824
## 0.47-8.2 -64.25806 -72.59770 -45.65289
## 0.78-10.94 -66.96000 -66.80952 -48.41379
## 0.93-11.69 -70.44340 -70.58025 -43.66346
## 00:14:bf:b1:97:8d 00:14:bf:b1:97:90
## 0-0.05 -63.04301 -55.23333
## 0.15-9.42 -69.45455 -46.88000
## 0.31-11.09 -69.13158 -53.88660
## 0.47-8.2 -60.79747 -49.58000
## 0.78-10.94 -65.00000 -54.84694
## 0.93-11.69 -65.59302 -47.27083
# Wide-format CV validation summary built from the offline data;
# sampleAngle = TRUE keeps a single random heading per location so the
# folds mimic the online data's single-orientation character.
keepVars = c("posXY", "posX","posY", "orientation", "angle")
onlineCVSummary = reshapeSS(offline, keepVars = keepVars, sampleAngle = TRUE)
head(onlineCVSummary,01)
## posXY posX posY orientation angle 00:0f:a3:39:dd:cd 00:14:bf:3b:c7:c6
## 0-0 0-0 0 0 90.3 90 -73.9633 -62.59551
## 00:14:bf:b1:97:81 00:14:bf:b1:97:8a 00:14:bf:b1:97:8d 00:14:bf:b1:97:90
## 0-0 -63.78261 -33.74737 -63.12941 -55.19588
# First Fold (validation): the wide summary rows for fold 1's locations.
onlineFold = subset(onlineCVSummary, posXY %in% permuteLocs[ , 1])
head(onlineFold)
## posXY posX posY orientation angle 00:0f:a3:39:dd:cd 00:14:bf:3b:c7:c6
## 1-12 1-12 1 12 359.9 0 -71.78182 -67.88785
## 1-8 1-8 1 8 270.2 270 -69.76364 -65.55435
## 10-8 10-8 10 8 135.1 135 -61.20909 -57.16327
## 11-3 11-3 11 3 359.9 0 -63.09910 -54.42857
## 13-8 13-8 13 8 90.3 90 -65.99115 -57.77174
## 14-7 14-7 14 7 180.2 180 -69.66972 -58.02273
## 00:14:bf:b1:97:81 00:14:bf:b1:97:8a 00:14:bf:b1:97:8d 00:14:bf:b1:97:90
## 1-12 -71.93617 -42.65979 -63.21978 -44.97872
## 1-8 -59.13793 -50.12766 -49.57143 -55.23469
## 10-8 -59.56122 -58.85149 -53.18000 -67.21429
## 11-3 -53.75000 -52.00926 -62.31707 -70.51685
## 13-8 -62.31313 -58.43000 -53.05495 -64.02198
## 14-7 -57.19318 -58.09278 -37.25000 -67.87209
# First Fold (Train): tall readings at every location NOT in fold 1.
offlineFold = subset(offline, posXY %in% permuteLocs[ , -1])
head(offlineFold)
## time posX posY orientation mac signal channel type
## 1 1.139643e+12 0 0 0 00:14:bf:b1:97:8a -38 2437000000 3
## 2 1.139643e+12 0 0 0 00:14:bf:b1:97:90 -56 2427000000 3
## 4 1.139643e+12 0 0 0 00:14:bf:b1:97:8d -65 2442000000 3
## 5 1.139643e+12 0 0 0 00:14:bf:b1:97:81 -65 2422000000 3
## 6 1.139643e+12 0 0 0 00:14:bf:3b:c7:c6 -66 2432000000 3
## 7 1.139643e+12 0 0 0 00:0f:a3:39:dd:cd -75 2412000000 3
## angle posXY
## 1 0 0-0
## 2 0 0-0
## 4 0 0-0
## 5 0 0-0
## 6 0 0-0
## 7 0 0-0
# Sanity check: predict fold 1 using 3 neighboring angles and k = 3,
# then score against the fold's true positions.
estFold = predXY(
newSignals = onlineFold[ , 6:(6+numMacs-1)],
newAngles = onlineFold[ , 4],
offlineFold,
numAngles = 3,
k = 3
)
actualFold = onlineFold[ , c("posX", "posY")]
calcError(estFold, actualFold)
## [1] 2.266667
#' @description Modified XY Prediction to help with faster CV for all K values at once (from 1 to K):
#' the expensive neighbor-ordering step is done once per observation and reused for every k
#' @param newSignals The Signal Values for the validation data for each observation
#' @param newAngles The Orientation of the validation data for each observation
#' @param trainData The training data to be used (tall format)
#' @param numAngles Number of closest reference angles to include in the data
#' @param K Perform the prediction for num neighbors from 1 to K
#' @param weighted Whether the mean value should be weighted based on distance or not.
#' @return A matrix of list entries with num rows = number of (validation) observations
#' and num columns = K; each entry is a c(meanX, meanY) pair for that observation and k
predXYallK = function(newSignals, newAngles, trainData, numAngles = 1, K = 10, weighted = FALSE){
  # Idiom fix: preallocate one slot per validation observation. The
  # original `list(length = nrow(newSignals))` created a single-element
  # list named "length" rather than an n-slot list.
  closeXY = vector("list", nrow(newSignals))
  for (i in seq_len(nrow(newSignals))) {
    trainSS = selectTrain(newAngles[i], trainData, m = numAngles)
    closeXY[[i]] = findNN(
      newSignal = as.numeric(newSignals[i, ]),
      trainSubset = trainSS,
      weighted = weighted
    )
  }
  #' @description Returns the (un)weighted mean X and Y locations for a single observation and single value of neighbors
  #' @param x Ordered dataframe from findNN: posXY, posX, posY, distance, weight
  #' @param k Number of nearest neighbors to use
  #' @return A pair of XY mean values for k number of neighbors
  k_means_single_obs = function(x, k){
    weights = x[1:k, 5]
    weighted_x = sum(x[1:k, 2] * weights) / sum(weights)
    weighted_y = sum(x[1:k, 3] * weights) / sum(weights)
    return(c(weighted_x, weighted_y))
  }
  #' @description All K predictions for one observation: a list of K (x, y) pairs
  all_K_means_single_obs = function(x, K){
    rows = list()
    for (k in seq_len(K)) {
      rows[[k]] = k_means_single_obs(x, k)
    }
    return(rows)
  }
  estXY = lapply(closeXY, all_K_means_single_obs, K)
  # rbind over lists yields an (observations x K) matrix of list entries.
  estXY = do.call("rbind", estXY)
  return(estXY)
}
#' @description Runs v-fold cross validation at a fixed numAngles,
#' accumulating errors for every neighbor count from 1 to K
#' @param K Number of nearest neighbors to use (Will run Grid Search over all values from k = 1 to K)
#' @param v Number of folds to use
#' @param offline Tall training data (one row per reading)
#' @param onlineCVSummary Wide per-location summary used as validation data
#' @param folds A matrix with rows = number of observations in each fold and columns = number of folds.
#' The values are the XY IDs to be included in that fold
#' @param numAngles Number of closest reference angles to include in the data
#' @param weighted Whether the mean value should be weighted based on distance or not.
#' @return list(err = length-K vector of errors summed over folds,
#'   allErr = data frame of per-fold, per-k errors)
run_kfold = function(K, v, offline, onlineCVSummary, folds, numAngles, weighted=FALSE){
err= rep(0, K)
errCV = rep(0, K)
# Per-fold error records (small: v * K rows, so growing it is harmless).
allErr = data.frame()
for (j in 1:v) {
print(paste("Running Fold: ", j))
# Fold j is validation; all remaining folds form the training set.
onlineFold = subset(onlineCVSummary, posXY %in% folds[ , j])
offlineFold = subset(offline, posXY %in% folds[ , -j])
actualFold = onlineFold[ , c("posX", "posY")]
estFold = predXYallK(
newSignals = onlineFold[ , 6:(6+numMacs-1)],
newAngles = onlineFold[ , 4],
trainData = offlineFold,
numAngles = numAngles,
K = K,
weighted=weighted
)
# Reformat into correct format for each 'k' value: estFold is a matrix
# of list entries where estFold[i, k] is the (x, y) pair for observation
# i with k neighbors.
for(k in 1:K){
estSingleK = data.frame()
for(i in seq(1, length(estFold)/K)){ # i = number of the observation
estSingleK = rbind(estSingleK, t(as.data.frame(estFold[i,k])))
}
err[k] = err[k] + calcError(estSingleK, actualFold)
errCV[k] = calcError(estSingleK, actualFold) #returning all folds
}
allErr=rbind(allErr,data.frame('fold'=j, 'numNeighbors' = 1:K,'errValue' = errCV))
}
return(list(err=err,allErr=allErr))
}
# Runs the full grid search: for every numAngles value in the global
# `allAngles`, performs v-fold CV over k = 1..K, one parallel worker per
# angle. Returns the row-bound per-fold error table (fold, numNeighbors,
# errValue, numAngles).
# NOTE(review): the `numAngles` parameter is shadowed by the foreach loop
# variable and never actually used; `allAngles` is read from the global
# environment rather than passed in.
get_CV = function(K,v,offline,onlineCVSummary,permuteLocs,numAngles,weighted = TRUE){
library(foreach)
library(progress)
library(doParallel)
library(doSNOW)
# One worker per core; the SNOW backend supports the progress callback.
cl <- makeCluster(detectCores())
doSNOW::registerDoSNOW(cl)
allErrors = data.frame()
start = proc.time()
pb <- progress::progress_bar$new(total = length(allAngles),format='[:bar] :percent :eta')
progress <- function(n) pb$tick()
# Helper functions and globals must be exported explicitly to workers.
allErrorsCV = foreach(numAngles = allAngles
,.combine = rbind
,.options.snow = list(progress=progress)
,.export = c('run_kfold','predXYallK','reshapeSS','findNN','calcError'
,'numMacs','selectTrain','roundOrientation','DIST_METHOD'
,'weightFormula')
) %dopar% {
print(paste("Running ", v, "-fold cross validation with 1 to ", K, " neighbors, for number of Angles = ", numAngles))
err = run_kfold(
K = K,
v = v,
offline = offline,
onlineCVSummary = onlineCVSummary,
folds = permuteLocs,
numAngles = numAngles,
weighted = weighted
)
# Tag this worker's results with its angle before rbind-combining.
err$allErr$numAngles = numAngles
return(err$allErr)
#return(data.frame(t(err)))
}
stopCluster(cl)
stop = proc.time()
diff = stop-start
print(diff)
return(allErrorsCV)
}
# Averages the per-fold errors for every (numAngles, numNeighbors)
# combination, then flags the overall best combination and the one chosen
# by caret's one-standard-error rule.
# NOTE(review): errValueSD is the SD across all combinations' mean errors
# (one value recycled to every row), not a per-combination fold SD.
find_best = function(allErrorsCV){
library('caret')
library(tidyverse)
allErrors = allErrorsCV %>%
group_by(numAngles,numNeighbors) %>%
dplyr::summarise(errValue = mean(errValue)) %>%
ungroup() %>%
mutate(errValueSD=sd(errValue)
,best=FALSE
,oneSE=FALSE)
# caret::best / caret::oneSE return row indices into the sorted grid;
# num = 30 is the resample count oneSE assumes — TODO confirm it matches
# the actual v * numAngles resamples here.
allErrors[best(as.data.frame(allErrors),"errValue",maximize=FALSE),]$best=TRUE
allErrors[oneSE(as.data.frame(allErrors),"errValue",maximize=FALSE,num=30),]$oneSE=TRUE
return(allErrors)
}
# Heatmap of mean error over the (numNeighbors, numAngles) grid, with the
# best and one-SE combinations labelled; rendered interactively via plotly.
# Relies on globals: errMethod (legend label) and allAngles (y-axis breaks).
plot_best = function(allErrors) {
p = ggplot(allErrors, aes(x=numNeighbors, y=numAngles, fill= errValue
, text=paste0("A:",numAngles," N:",numNeighbors,errMethod," :", round(errValue,3)))) +
geom_tile() +
scale_y_continuous(breaks=allAngles) +
#scale_fill_distiller(palette = "RdYlBu") +
# Diverging green/orange/red fill centred on the midpoint of the error range.
scale_fill_gradient2(low = "green",mid='darkorange', high = "darkred", na.value = NA
,midpoint=mean(c(max(allErrors$errValue),min(allErrors$errValue)))
#,midpoint=median(Errors$errValue)
)+
#scale_fill_distiller(palette = "Blues",direction=0) +
labs(fill = errMethod) +
# Mark the winning cells (nudged so the two labels don't overlap).
geom_text(data=allErrors[allErrors$best,],label='Best',size=3,nudge_y=.27) +
geom_text(data=allErrors[allErrors$oneSE,],label='1SE',size=3)
#p
ggplotly(p, tooltip="text")
}
# Plots per-location prediction error over the building floor plan
# ('building.png' must exist in the working directory). Point color
# encodes each observation's error under the chosen distance method.
floorErrorMap = function(estXY, actualXY,method = DIST_METHOD){
# Per-observation error (not aggregated, unlike calcError).
if(method == 'Euclidean') er = sqrt(rowSums((estXY - actualXY)^2))
if(method == 'Manhattan') er = rowSums(abs(estXY - actualXY))
data = cbind(actualXY,er)
library(png)
library(ggpubr)
img <- png::readPNG('building.png')
p = ggplot(data=data,aes(x=posX,y=posY,color=er)) +
background_image(img) +
labs(x='X',y='Y',color='Error')+
# Axis limits align the points with the floor-plan image extent.
ggplot2::scale_y_continuous(limits=c(-3.1,14.4))+
ggplot2::scale_x_continuous(limits=c(0,33.4))+
ggplot2::geom_point(shape=16,size=4) +
scale_color_gradient2(low = "green",mid='darkorange', high = "darkred", na.value = NA
,midpoint=mean(c(max(data$er),min(data$er)))
#,midpoint=median(Errors$errValue)
)
return(p)
}
# Unweighted grid search over numAngles (1-8) and k (1-K).
# NOTE(review): no top-level `numAngles` exists here; the call works only
# because get_CV never forces that argument (its foreach loop variable
# shadows it) — lazy evaluation hides the missing value.
allErrorsCV = get_CV(K=K,v=v
,offline=offline,onlineCVSummary=onlineCVSummary
,permuteLocs=permuteLocs,numAngles=numAngles
,weighted = FALSE)
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
## Loading required package: snow
##
## Attaching package: 'snow'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, clusterSplit, makeCluster, parApply,
## parCapply, parLapply, parRapply, parSapply, splitIndices,
## stopCluster
## user system elapsed
## 2.61 2.59 130.82
# Average fold errors and flag the best / one-SE combinations (unweighted).
allErrors = find_best(allErrorsCV)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## `summarise()` regrouping output by 'numAngles' (override with `.groups` argument)
allErrors
## # A tibble: 160 x 6
## numAngles numNeighbors errValue errValueSD best oneSE
## <int> <int> <dbl> <dbl> <lgl> <lgl>
## 1 1 1 3.42 0.200 FALSE FALSE
## 2 1 2 2.97 0.200 FALSE FALSE
## 3 1 3 2.80 0.200 FALSE FALSE
## 4 1 4 2.88 0.200 FALSE FALSE
## 5 1 5 2.77 0.200 FALSE FALSE
## 6 1 6 2.82 0.200 FALSE FALSE
## 7 1 7 2.83 0.200 FALSE FALSE
## 8 1 8 2.88 0.200 FALSE FALSE
## 9 1 9 2.87 0.200 FALSE FALSE
## 10 1 10 2.90 0.200 FALSE FALSE
## # ... with 150 more rows
# Show the winning (numAngles, numNeighbors) combination(s).
print(filter(allErrors,best | oneSE))
## # A tibble: 1 x 6
## numAngles numNeighbors errValue errValueSD best oneSE
## <int> <int> <dbl> <dbl> <lgl> <lgl>
## 1 6 4 2.47 0.200 TRUE TRUE
plot_best(allErrors)
# Hyper-parameters selected by the one-SE rule (unweighted run).
final = filter(allErrors,oneSE)
finalAngle = final$numAngles
finalK = final$numNeighbors
finalAngle
## [1] 6
finalK
## [1] 4
#
# final = which(allErrors == min(allErrors), arr.ind = TRUE)
#
#
# finalAngleIndex = final[1]
# finalKIndex = final[2]
# finalAngle = allAngles[finalAngleIndex]
# finalK = allNeighbors[finalKIndex]
#
# finalAngle
# finalK
actualXY = onlineSummary %>% dplyr::select(posX, posY)
# Final unweighted model evaluated on the true online (test) data.
estXYfinalK = predXY(
newSignals = onlineSummary[ , 6:(6+numMacs-1)],
newAngles = onlineSummary[ , 4],
trainData = offline,
numAngles = finalAngle,
k = finalK,
weighted = FALSE
)
calcError(estXYfinalK, actualXY)
## [1] 2.276667
floorErrorMap(estXY = estXYfinalK, actualXY = actualXY)
# Same grid search, this time with inverse-squared-distance weighting.
allErrorsCVW = get_CV(K=K,v=v
,offline=offline,onlineCVSummary=onlineCVSummary
,permuteLocs=permuteLocs,numAngles=numAngles
,weighted = TRUE)
## user system elapsed
## 3.11 2.66 142.86
# Average fold errors and flag the best / one-SE combinations (weighted).
allErrorsW = find_best(allErrorsCVW)
## `summarise()` regrouping output by 'numAngles' (override with `.groups` argument)
allErrorsW
## # A tibble: 160 x 6
## numAngles numNeighbors errValue errValueSD best oneSE
## <int> <int> <dbl> <dbl> <lgl> <lgl>
## 1 1 1 3.42 0.146 FALSE FALSE
## 2 1 2 2.98 0.146 FALSE FALSE
## 3 1 3 2.82 0.146 FALSE FALSE
## 4 1 4 2.82 0.146 FALSE FALSE
## 5 1 5 2.69 0.146 FALSE FALSE
## 6 1 6 2.72 0.146 FALSE FALSE
## 7 1 7 2.72 0.146 FALSE FALSE
## 8 1 8 2.75 0.146 FALSE FALSE
## 9 1 9 2.73 0.146 FALSE FALSE
## 10 1 10 2.76 0.146 FALSE FALSE
## # ... with 150 more rows
# Show the winning weighted combination(s).
print(filter(allErrorsW,best | oneSE))
## # A tibble: 1 x 6
## numAngles numNeighbors errValue errValueSD best oneSE
## <int> <int> <dbl> <dbl> <lgl> <lgl>
## 1 6 4 2.48 0.146 TRUE TRUE
plot_best(allErrorsW)
# Hyper-parameters selected by the one-SE rule (weighted run).
finalW = filter(allErrorsW,oneSE)
# Bug fix: these previously read from `final` (the unweighted result)
# instead of `finalW`, so the weighted selection was silently ignored.
# (With these CV results the two coincide at numAngles = 6, k = 4.)
finalAngleW = finalW$numAngles
finalKW = finalW$numNeighbors
# Chosen weighted hyper-parameters.
finalAngleW
## [1] 6
finalKW
## [1] 4
actualXY = onlineSummary %>% dplyr::select(posX, posY)
# Final weighted model evaluated on the true online (test) data.
estXYfinalKW = predXY(
newSignals = onlineSummary[ , 6:(6+numMacs-1)],
newAngles = onlineSummary[ , 4],
trainData = offline,
numAngles = finalAngleW,
k = finalKW,
weighted = TRUE
)
calcError(estXYfinalKW, actualXY)
## [1] 2.278147
floorErrorMap(estXY = estXYfinalKW, actualXY = actualXY)
# time of offline and online measures
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Convert epoch-millisecond timestamps to POSIX times and compare the
# time of day at which offline vs online measurements were taken.
offlineTime = offline$time /1000
class(offlineTime) = c("POSIXt", "POSIXct")
# Fractional hour of day, counted per distinct value.
dfOffT= data.frame('hour'=hour(offlineTime) + lubridate::minute(offlineTime)/60) %>% count(hour)
dfOffT$type = 'Offline'
onlineTime = online$time /1000
class(onlineTime) = c("POSIXt", "POSIXct")
dfOnT= data.frame('hour'=hour(onlineTime)+ lubridate::minute(onlineTime)/60) %>% count(hour)
dfOnT$type = 'Online'
# Stack both traces and plot counts by hour, one panel per trace.
dfTime = rbind(dfOffT, dfOnT)
ggplot(dfTime,aes(x=hour,y=n)) + geom_point() + ggtitle('Time of the day when measures where taken') +
facet_wrap(vars(type)) + labs(x='Hour of the Day',y='Count of measures')